In [3]:
from elpisciDSFuncs.scrnaseq import *
from nbdev.showdoc import show_doc
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [4]:
!ls
GSE160048_human_glom_single_cell_rpkms.format.tsv
GSE160048_human_glom_single_cell_rpkms.format.tsv.gz
GSE160048_human_glom_single_cell_rpkms.txt
GSE160048_human_glom_single_cell_rpkms.txt.gz
pmid33837218-GSE160048-0_GEO2h5ad.ipynb
pmid33837218-GSE160048-1_viz.ipynb
pmid33837218-GSE160048.h5ad
In [6]:
adata = sc.read('pmid33837218-GSE160048.h5ad')
adata
Out[6]:
AnnData object with n_obs × n_vars = 765 × 3872
    obs: 'n_genes_by_counts', 'total_counts', 'leiden', 'Cell_subtype'
    var: 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'Cell_subtype_colors', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'
In [16]:
CLEC = []
for i in adata.raw.var_names:
    if 'CLEC' in i:
        CLEC.append(i)
In [18]:
# import matplotlib.pyplot as plt
plt.rcParams['axes.ymargin'] = 0
plt.rcParams['axes.spines.left']   = True
plt.rcParams['axes.spines.right']  = False
plt.rcParams['axes.spines.top']    = False
plt.rcParams['axes.spines.bottom'] = True
sc.settings.set_figure_params(dpi=100, facecolor='white',fontsize=10,figsize=(5, 5))
sc.pl.dotplot(
         adata,
        {'CLECs':CLEC}, 
        groupby='Cell_subtype',
        return_fig=True,#title='tissue:'+t,
        mean_only_expressed=True,colorbar_title='Log2RPMK',use_raw=True
#         dot_max=0.5,
#         dot_min=0,
#         vmin=0,
#         vmax=1.2,
    ).add_totals(color='black',size=0.5).show()
In [ ]:
sc.pl.dotplot(
         adata,'IL6',
        groupby='Cell_subtype',
#         return_fig=True,#title='tissue:'+t,
        mean_only_expressed=True,colorbar_title='Log2RPMK',use_raw=True
#         dot_max=0.5,
#         dot_min=0,
#         vmin=0,
#         vmax=1.2,
    )
sc.pl.dotplot(adata,['IL6','MKI67'],groupby='Cell_subtype',
              mean_only_expressed=True,colorbar_title='Log2RPMK',use_raw=True,
        dot_max=0.15,
        dot_min=0,
             )
sc.pl.dotplot(
    adata,
    ['TNF','TNFRSF1A','TNFRSF1B','FCAR'],
    groupby='Cell_subtype',mean_only_expressed=True,colorbar_title='Log2RPMK',use_raw=True)
sc.settings.set_figure_params(dpi=100, facecolor='white',fontsize=10,figsize=(5, 5))
sc.pl.dotplot(
         adata,'n_genes_by_counts', 
        groupby='Cell_subtype',
        return_fig=True,#title='tissue:'+t,
        mean_only_expressed=True
#         dot_max=0.5,
#         dot_min=0,
#         vmin=0,
#         vmax=1.2,
    ).add_totals(color='black',size=0.5).show()
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [2]:
# !head -5 GSE160048_human_glom_single_cell_rpkms.txt

格式化从 GEO 下载的原始数据

In [975]:
exp_matrix = pd.read_csv('GSE160048_human_glom_single_cell_rpkms.txt',index_col=False, nrows=None,sep='\t')
len(exp_matrix.keys())
# 767
exp_matrix.shape
# (16988, 767)
exp_matrix['SYMBOL'] = exp_matrix['Unnamed: 0'].apply(lambda x:x.split('|')[0])
len(np.unique(test['SYMBOL']))
# 16939
col = []
for i in exp_matrix.keys():
    if '_' not in i:continue
    col.append(i)
col = ['SYMBOL']+col
len(col)
# 767
Out[975]:
(16927, 768)

注意这里去掉 ERCC_的写法,为了后续用简答的 nromalization method

In [ ]:
exp_matrix = exp_matrix[exp_matrix['SYMBOL'].apply(lambda x:x[:5]!='ERCC_')]
exp_matrix.shape
# (16988, 767)
# (16927, 768)
exp_matrix = exp_matrix[col]
exp_matrix.set_index('SYMBOL',inplace=True)
exp_matrix = exp_matrix.T
### 输出到文件
exp_matrix.to_csv('GSE160048_human_glom_single_cell_rpkms.format.tsv',sep='\t',index=None)
In [976]:
exp_matrix.describe()
Out[976]:
311s_O3 680s_K20 311s_M10 680s_A15 680s_N21 679s_N10 680s_M18 679s_C6 680s_K22 677s_M4 ... 677s_H19 240s_H20 240s_J5 001s_L23 001s_L19 240s_L21 240s_I19 240s_K11 240s_J17 240s_H15
count 16927.000000 16927.000000 16927.000000 16927.000000 16927.000000 16927.000000 16927.000000 16927.000000 16927.000000 16927.000000 ... 16927.000000 16927.000000 16927.000000 16927.000000 16927.000000 16927.000000 16927.000000 16927.000000 16927.000000 16927.000000
mean 32.156925 34.266216 33.363815 33.930566 30.951951 32.934724 33.560113 30.089261 34.258939 30.245884 ... 34.861247 30.436970 28.496192 25.155460 25.845597 30.984374 28.205357 28.834901 20.969901 22.527850
std 469.894067 451.367603 592.163159 511.142264 403.026450 403.083919 444.187535 286.115207 374.318364 396.650838 ... 218.762296 289.239672 411.470533 277.207956 273.903431 470.436317 389.971261 405.108314 177.792873 175.201954
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 14.167700 0.000000 0.000000 0.000000 0.000000 0.000000 0.139117 0.000000 0.000000 0.000000
max 48953.136346 43755.818769 62923.952129 47040.152902 36571.252220 40706.548873 32864.137879 12089.624407 25786.732160 32938.405719 ... 13301.093373 26605.050080 39259.913294 28340.673221 27516.962841 47286.931637 33934.670516 40665.176997 7203.074943 7968.179295

8 rows × 766 columns

读入格式化的 GEO 表达矩阵

In [986]:
adata = sc.read_csv(
    'GSE160048_human_glom_single_cell_rpkms.format.tsv',  # the directory with the `.mtx` file
    delimiter='\t', 
    first_column_names='SYMBOL',
    dtype= 'float32',
    # use gene symbols for the variable names (variables-axis index)
) 
adata

adata.var_names_make_unique()
adata.obs_names_make_unique()
adata

# sc.pl.highest_expr_genes(adata, n_top=20, )

# sc.pp.filter_cells(adata, min_genes=200)
# sc.pp.filter_genes(adata, min_cells=3)
### 因为不能再丢数据了,所以不做额外过滤
sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)

sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.

QC 质量控制

In [987]:
adata = adata[adata.obs.n_genes_by_counts < 7000, :]
# sc.pp.normalize_total(adata, target_sum=1e4)
# Total-count normalize (library-size correct) the data matrix 𝐗 to 10,000 reads per cell, so that counts become comparable among cells.
sc.pp.log1p(adata,base=2)

sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)

sc.pl.highly_variable_genes(adata)
In [988]:
adata.raw = adata

adata = adata[:, adata.var.highly_variable]
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')

sc.pl.pca(adata)
sc.pl.pca_variance_ratio(adata, log=True)

umap 的使用方法,注意参数的选择

In [989]:
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)

sc.tl.umap(adata)
sc.tl.leiden(adata)
adata
Out[989]:
AnnData object with n_obs × n_vars = 765 × 3872
    obs: 'n_genes_by_counts', 'total_counts', 'leiden'
    var: 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'umap', 'leiden'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'

细胞分群

In [1029]:
# import matplotlib.pyplot as plt
plt.rcParams['axes.ymargin'] = 0
plt.rcParams['axes.spines.left']   = True
plt.rcParams['axes.spines.right']  = False
plt.rcParams['axes.spines.top']    = False
plt.rcParams['axes.spines.bottom'] = True
In [1030]:
for i in exp_matrix.keys():
    if 'CD4' in i:
        print(i)
sc.settings.set_figure_params(dpi=60, facecolor='white')
sc.pl.umap(adata, 
color=[
'n_genes_by_counts',
'NPHS2','PODXL','NPHS1','NPHS2','IGFBP7', #podocyte
'leiden'
],
size=80,
ncols=5,
use_raw=True,
)

# cell_type_anno = {}
# cell_type_anno['Podo'] = ['7']
In [996]:
adata.obs['leiden'].value_counts()
Out[996]:
0    190
1    150
2    109
3     96
4     61
5     56
6     45
7     27
8     24
9      7
Name: leiden, dtype: int64
In [1009]:
sc.pl.umap(adata, 
color=[
'n_genes_by_counts',
'PECAM1','SPP1','leiden'
],
# size=100,
ncols=5,use_raw=True,legend_loc='on data',palette='Set3')
###

# cell_type_anno['Tubules'] = ['3']

细胞亚群信息写入单细胞文件

In [1011]:
cell_type_anno = {}
cell_type_anno['Podo'    ] = ['7']
cell_type_anno['Tubules' ] = ['3']
cell_type_anno['T&NK'    ] = ['5']
cell_type_anno['Mφ&Μono'] = ['1','9','2']
cell_type_anno['MLC'     ] = ['4'] #PDGFRB
cell_type_anno['GEC'     ] = ['0','8','6']

# !DCT: distal convoluted tubule;
# !GEC: glomerular endothelial cells
# !MLC: mesangial-like cell;
# !MNP: mononuclear phagocyte;
# !PTC: proximal tubule cell;
# !T+NK: T cell and NK cells
# !cTAL: cortical thick ascending limb
# !CD: collecting duct;

leiden_cell_type = {}
for i,j in cell_type_anno.items():
    for l in j:
        leiden_cell_type[l] = i
adata.obs['Cell_subtype'] = adata.obs.leiden.apply(lambda x:leiden_cell_type[x])
7 Podo
3 Tubules
5 T&NK
1 Mφ&Μono
9 Mφ&Μono
2 Mφ&Μono
4 MLC
0 GEC
8 GEC
6 GEC
In [1026]:
adata.obs['Cell_subtype'].value_counts()
Out[1026]:
Mφ&Μono    266
GEC        259
Tubules     96
MLC         61
T&NK        56
Podo        27
Name: Cell_subtype, dtype: int64
In [998]:
# X_tsne_list = []
# for i,j in adata.obs.iterrows():
#     ### 把原来的位置都保存进来
#     X_tsne_list.append([j.Global_tSNE_1,j.Global_tSNE_2,j.Sub_tSNE_1,j.Sub_tSNE_2])
# #     X_tsne_list.append([j.Sub_tSNE_1,j.Sub_tSNE_2])
# replace_X_tsne = np.array(X_tsne_list)
# type(replace_X_tsne)
# adata.obsm['X_tsne']=replace_X_tsne

# adata.obs['leiden'].value_counts()

T&NK

In [809]:
# help(sc.pl.umap)

强烈注意这里 raw 的写入的用法,避免丢失基因表达数据

In [1031]:
sc.pl.umap(adata, 
color=[
'NPHS2', #podocyte
'PTPRC',# CD45
'CD3D','CD8A',# T cell
# 'leiden'
'Cell_subtype'
],
size=100,
ncols=5,
use_raw=True,palette='Set3',legend_loc='on data')

#######
# cell_type_anno['T&NK'] = ['5']


sc.pl.umap(adata, 
color=['TNF','TGFB1','IL6','Cell_subtype'],
ncols=5,
use_raw=True,palette='Set3',legend_loc='on data')

sc.pl.umap(adata, 
color=['Cell_subtype'],
ncols=5,
use_raw=True,palette='Set3',legend_loc='on data')

Mφ&Μono

In [1045]:
sc.settings.set_figure_params(dpi=60, facecolor='white',fontsize=16,figsize=(5, 5))
sc.pl.umap(adata, color=['leiden'],ncols=4,size=100,legend_loc='on data',palette='Set3')


# cell_type_anno['Mφ&Μono'] = ['1','9','2']
# cell_type_anno['MLC'] = ['4'] #PDGFRB

sc.pl.umap(adata, 
color=[
'n_genes_by_counts',
'NPHS2', #podocyte
'CD3D','PTPRC',# T cell
'TNF',
'FCN1','CD14',
'FCGR3A',# CD16
'FCGR2A','FCGR2B',
'FCAR',# FCAR CD89 (IgA1,2 receptor)
'MRC1',# CD206
'CD86','CD163',
'C1QA',#'C1QB','C1QC',
'MERTK',
'CSF1',
'SPP1',
'PDGFRB', #MLC
'FHL2',# MC, mesangial cell;
],
# size=80,
ncols=5,
use_raw=True
          )
In [1033]:
# cell_type_anno['GEC'] = ['0','8','6']

sc.pl.umap(adata, 
color=[
'n_genes_by_counts',
'PECAM1','leiden'
],
# size=100,
ncols=5,use_raw=True,legend_loc='on data',palette='Set3')
In [1046]:
sc.settings.set_figure_params(dpi=60, facecolor='white',fontsize=16,figsize=(5, 5))

sc.pl.umap(adata, 
color=[
'n_genes_by_counts',
'NPHS2', #podocyte
'CD3D','PTPRC',# T cell
'TNF',
'FCGR2A','FCGR2B',
'FCAR',# FCAR CD89 (IgA1,2 receptor)
'CD86','CD163',
'PECAM1',
'SPP1',
'PDGFRB',
'FHL2',# MC, mesangial cell;
'NT5E','PDGFRB',
'CLDN4','AQP3',## PC, principal cells;
'CUBN','SLC12A3'
],
# size=100,
ncols=5,
use_raw=True
          )

h5ad写入文件

In [1047]:
adata.write('pmid33837218-GSE160048.h5ad')
In [1073]:
sc.settings.set_figure_params(dpi=100, facecolor='white',fontsize=10,figsize=(5, 5))
sc.pl.dotplot(
         adata,
        {'genes':['IL6','TNF','TGFB1'
#             'Trem2','Adgre1',
           ]}, 
        groupby='Cell_subtype',
        return_fig=True,#title='tissue:'+t,
        mean_only_expressed=True,colorbar_title='Log2RPMK',use_raw=True
#         dot_max=0.5,
#         dot_min=0,
#         vmin=0,
#         vmax=1.2,
    ).add_totals(color='black',size=0.5).show()


sc.pl.dotplot(
         adata,'IL6',
        groupby='Cell_subtype',
#         return_fig=True,#title='tissue:'+t,
        mean_only_expressed=True,colorbar_title='Log2RPMK',use_raw=True
#         dot_max=0.5,
#         dot_min=0,
#         vmin=0,
#         vmax=1.2,
    )
In [1074]:
sc.pl.dotplot(adata,['IL6','MKI67'],groupby='Cell_subtype',
              mean_only_expressed=True,colorbar_title='Log2RPMK',use_raw=True,
        dot_max=0.15,
        dot_min=0,
             )
sc.pl.dotplot(
    adata,
    ['TNF','TNFRSF1A','TNFRSF1B','FCAR'],
    groupby='Cell_subtype',mean_only_expressed=True,colorbar_title='Log2RPMK',use_raw=True)
In [1075]:
sc.settings.set_figure_params(dpi=100, facecolor='white',fontsize=10,figsize=(5, 5))
sc.pl.dotplot(
         adata,'n_genes_by_counts', 
        groupby='Cell_subtype',
        return_fig=True,#title='tissue:'+t,
        mean_only_expressed=True
#         dot_max=0.5,
#         dot_min=0,
#         vmin=0,
#         vmax=1.2,
    ).add_totals(color='black',size=0.5).show()
In [1076]:
# ?sc.pl.dotplot
In [1078]:
sc.pl.dotplot(adata,'MKI67',groupby='Cell_subtype',
              mean_only_expressed=True,colorbar_title='Log2RPMK',use_raw=True,
        dot_max=0.15,
        dot_min=0,
             )